This document illustrates how to load data from Elasticsearch by leveraging Spark, Hadoop, and Hive, assuming everything is already configured. The table, as defined in Hive, looks like:
-- External Hive table backed by an Elasticsearch index through the
-- Elasticsearch-Hadoop storage handler; queries scroll through the
-- index pattern 'blueprism.process.completed-*'.
-- NOTE(review): Hive normally rejects ROW FORMAT SERDE combined with
-- STORED BY (the storage handler supplies its own SerDe) — confirm this
-- DDL actually runs as written.
CREATE EXTERNAL TABLE blueprism (
id STRING,
ts TIMESTAMP,
processprocessname STRING,
processname STRING,
stagename STRING,
actionname STRING,
runningresourcename STRING,
processduration FLOAT,
startdatetime TIMESTAMP,
duration FLOAT
)
ROW FORMAT SERDE 'org.elasticsearch.hadoop.hive.EsSerDe'
STORED BY 'org.elasticsearch.hadoop.hive.EsStorageHandler'
TBLPROPERTIES(
-- Cluster endpoint and the index pattern this table reads from.
'es.nodes' = 'mbp15.local',
'es.resource' = 'blueprism.process.completed-*',
-- Expose document metadata (e.g. _id) so it can be mapped to a column.
'es.read.metadata' = 'true',
-- Hive column -> Elasticsearch field mapping, including @timestamp -> ts.
'es.mapping.names' = 'id:_metadata._id, actionname:actionname, ts:@timestamp,processname:processname,stagename:stagename,runningresourcename:runningresourcename,processduration:processduration,startdatetime:startdatetime,duration:duration,processprocessname:processprocessname',
-- Scroll page size per request; 10 is very small — presumably set low for
-- demo purposes.
'es.scroll.size' = '10',
'es.mapping.id' = 'id');
# Spark configuration for the SparkR session: executor/driver sizing, Kryo
# serialization, HDFS event logging, and the Elasticsearch-Hadoop connector
# jar on the driver classpath.
spark_env = list('spark.executor.memory' = '2g',
'spark.executor.instances' = '4',
'spark.executor.cores' = '4',
'spark.driver.memory' = '1g',
'spark.cores.max' = '4',
'spark.total-executor-cores' = '4',
'spark.sql.globalTempDatabase'= 'testGlobal',
'spark.es.nodes'= 'MBP15.local',
'spark.serializer'= 'org.apache.spark.serializer.KryoSerializer',
'spark.eventLog.dir'= 'hdfs://mbp15.local:9000/tmp',
'spark.eventLog.enabled'= 'true',
'spark.driver.extraClassPath' = '/usr/local/Cellar/apache-spark/2.1.1/libexec/jars/*:/Users/rumi/Downloads/elasticsearch-hadoop-6.0.0/dist/elasticsearch-hadoop-6.0.0.jar')
# Extra arguments handed to spark-submit when SparkR launches its JVM backend.
Sys.setenv(SPARKR_SUBMIT_ARGS = "--master spark://MBP15.local:7077 --driver-memory 4g --total-executor-cores 8 --executor-memory 4g --num-executors 1 --driver-cores 1 --executor-cores 1 --deploy-mode client sparkr-shell")
library(SparkR)
# Stop any session left over from a previous run before starting a new one.
sparkR.session.stop()
# Start a Hive-enabled SparkR session; the executor PATH and SPARK_CLASSPATH
# point at the local Hadoop install and the ES-Hadoop jar so executors can
# reach Elasticsearch.
sparkR.session(appName = "RStudio-SparkHive-SpecificSetup", sparkHome = Sys.getenv("SPARK_HOME"),
sparkConfig = spark_env, enableHiveSupport = TRUE, sparkExecutorEnv = list(PATH = "/usr/local/sbin:/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin:/opt/X11/bin:/usr/local/Cellar/hadoop/2.7.3/bin:/usr/local/Cellar/hadoop/2.7.3/sbin",
SPARK_CLASSPATH = "/usr/local/Cellar/apache-spark/2.1.1/libexec/jars/*:/Users/hadoop/Downloads/bin:/Users/rumi/Downloads/bin:/Users/rumi/Downloads/elasticsearch-hadoop-6.0.0/dist/elasticsearch-hadoop-6.0.0.jar"))
## Launching java with spark-submit command /usr/local/Cellar/apache-spark/2.1.1/libexec/bin/spark-submit --driver-class-path "/usr/local/Cellar/apache-spark/2.1.1/libexec/jars/*:/Users/rumi/Downloads/elasticsearch-hadoop-6.0.0/dist/elasticsearch-hadoop-6.0.0.jar" --master spark://MBP15.local:7077 --driver-memory 4g --total-executor-cores 8 --executor-memory 4g --num-executors 1 --driver-cores 1 --executor-cores 1 --deploy-mode client sparkr-shell /var/folders/j9/4wqs8g_558l36ppwth2802940000gn/T//RtmpRCZxSS/backend_port716e419eab0b
## Java ref type org.apache.spark.sql.SparkSession id 1
# Quieten the console: only errors from here on.
setLogLevel("ERROR")
# Sanity checks: list databases, switch to the 'rumi' database, list tables.
head(sql("show databases"))
head(sql("use rumi"))
head(sql("show tables"))
When running the first command in RStudio, you will notice that nothing happens until the second command (in this case print) is executed. Be careful: this lazy evaluation sometimes creates the perception that everything is going well until the expression is truly evaluated. You will notice that the object bp created here is of type SparkDataFrame; in order to use all R packages on it, it has to be converted into an R dataframe.
# Lazily define a SparkDataFrame; nothing executes until an action
# (print, collect, head, ...) forces evaluation.
bp <- sql("select processprocessname,processduration from blueprism where processduration is not null limit 100")
print(bp)
## SparkDataFrame[processprocessname:string, processduration:float]
# collect() materializes the (at most 100) rows into a local R data.frame.
bp_local <- collect(select(bp, "processprocessname", "processduration"))
head(bp_local)
library("ggplot2")
# Bar chart of execution counts per process name, built from the local copy.
ggplot(data = bp_local, aes(x = processprocessname, ..count..)) + geom_bar(alpha = 0.9) +
xlab("processprocessname") + labs(title = "Top processes # execution ") +
theme(title = element_text(size = 10, face = "bold"), axis.text = element_text(size = 6))
# Look up one process name in the blueprism table and print both the type of
# the handle returned by sql() (an S4 SparkDataFrame) and the collected rows.
test <- function(p) {
  query <- paste0(
    "select processprocessname,processduration from blueprism where processprocessname like \"",
    p,
    "\" and duration is not null limit 10"
  )
  result <- sql(query)
  print(typeof(result))
  print(collect(result))
}
test("Queue Step 1")
## [1] "S4"
## processprocessname processduration
## 1 Queue Step 1 0.130
## 2 Queue Step 1 0.130
## 3 Queue Step 1 0.183
## 4 Queue Step 1 0.123
## 5 Queue Step 1 0.110
## 6 Queue Step 1 0.153
## 7 Queue Step 1 0.130
## 8 Queue Step 1 0.117
## 9 Queue Step 1 0.126
## 10 Queue Step 1 0.287
Interesting — type S4! Have a look at the documentation and you will discover that even though R seems relaxed about data types, things become more complex when manipulating elaborate data structures. Enjoy :-)
# BP_1: build a dynamic Hive query for one process name (paste0 concatenates
# the pieces), pull the processduration column into a local data.frame with
# select()/collect(), and transpose it into a row vector with t().
BP_1 <- function(p) {
  query <- paste0(
    "select processduration from rumi.blueprism where processprocessname like \"",
    p,
    "\" and duration is not null limit 10"
  )
  durations <- collect(select(sql(query), "processduration"))
  t(durations)
}
# BP_2: return a SparkDataFrame of rows whose processduration is not null,
# capped at <p> records via a LIMIT clause.
BP_2 <- function(p) {
  limit_clause <- paste0(" limit ", p)
  query <- paste0(
    "select processprocessname,processduration,startdatetime from rumi.blueprism where processduration is not null",
    limit_clause
  )
  sql(query)
}
# BP_2_nolimit: same query as BP_2 but over all columns; when called with
# p <= 0 no LIMIT clause is appended and the full result set is returned.
BP_2_nolimit <- function(p) {
  suffix <- if (p > 0) paste0(" limit ", p) else ""
  sql(paste0(
    "select * from rumi.blueprism where processduration is not null",
    suffix
  ))
}
# BP_3: for each distinct process name in df, attempt to extract that
# process's processduration values via dapply over the name list.
BP_3 <- function(df) {
TaskNameList <- distinct(select(df, "processprocessname")) # get list of all possible tasks
# NOTE(review): the dapply worker function closes over the SparkDataFrame
# `df` and calls SparkR verbs on it, but SparkDataFrames are not usable
# inside worker-side functions — this likely fails when evaluated; verify.
# Also `schema(df)` declares the full schema as output even though the
# worker selects a single column — confirm the intended output schema.
dapply(TaskNameList, function(p) {
select(filter(df, df$processprocessname == p), "processduration")
}, schema(df))
}
# BP_4_nolimit: rows with a non-null processduration inside a fixed
# startdatetime window; p <= 0 disables the LIMIT clause entirely.
BP_4_nolimit <- function(p) {
  base_query <- "select processprocessname,processduration,startdatetime from rumi.blueprism where processduration is not null and startdatetime between \"2017-09-22 07:48:22.000\" and \"2017-09-22 09:21:54.000\""
  suffix <- if (p > 0) paste0(" limit ", p) else ""
  sql(paste0(base_query, suffix))
}
# Cache the full (no LIMIT) non-null-duration dataset as a SparkDataFrame so
# later queries reuse it instead of re-scanning the Elasticsearch index.
SPDF_QP <- cache(BP_2_nolimit(0))
# Derive a Julian-style date column ("yyyyD" = year + day-of-year) from
# startdatetime; this transformation runs on the cluster, not locally.
SPDF_QP$julian <- date_format(SPDF_QP$startdatetime, "yyyyD")
# Materialize one time window locally. We need enough driver memory to hold
# the result: because of lazy evaluation, the whole cost of the pipeline is
# paid here, when collect() forces execution.
QP <- collect(select(filter(SPDF_QP, "startdatetime between \"2017-10-10 21:11:44.375\" and \"2017-10-17 18:24:55.379\""),
"processprocessname", "processduration"))
# Distinct process names computed on the FULL cached dataset; useful when
# mapping per-process processing over a list or dataframe.
tlist <- as.list(collect(distinct(select(SPDF_QP, "processprocessname")))[,
1])
# However, the previous query ran over the complete dataset, so it may list
# processes that are absent from the windowed QP subset — which would harm
# the mapping step later. Recompute the list from QP itself (unique() on the
# local column, overwriting the value above).
tlist <- unique(QP$processprocessname)
# ER_Plot1: base-graphics histogram of a duration vector, with the x axis
# clipped at mean + 6 standard deviations to tame long right tails.
#
# x           numeric vector of durations in seconds; empty input is a no-op
# chart_title text appended to the plot title
#
# Returns hist()'s "histogram" object (invisibly), or invisible NULL when
# x is empty.
ER_Plot1 <- function(x, chart_title = "no title") {
  # hist() raises an error on empty input, so it must be guarded.
  if (NROW(x) > 0) {
    # Bug fix: mean()/sd() without na.rm return NA for any NA duration
    # (and sd() of a single value is NA), which made plot.window() fail
    # with non-finite xlim. Compute the clip limit NA-safely and fall back
    # to the plain data maximum when sd is undefined.
    upper <- mean(x, na.rm = TRUE) + 6 * sd(x, na.rm = TRUE)
    if (is.na(upper)) upper <- max(x, na.rm = TRUE)
    hist(x, main = paste("Duration stats Process:", chart_title),
         xlab = "Seconds", cex.main = 1, xlim = c(0, upper))
  }
}
# ER_Plot2: histogram of a duration vector with two overlays, each scaled
# from density units to expected bin counts: a fitted normal density curve
# (blue) and the matching cumulative normal curve (green), both using the
# sample mean and standard deviation. Empty input draws nothing.
ER_Plot2 <- function(x, chart_title = "no title") {
  if (NROW(x) > 0) {
    histo <- hist(x, breaks = 10, col = "red", xlab = "Seconds",
                  cex.main = 1,
                  main = paste("Duration stats Process:", chart_title))
    grid_x <- seq(min(x), max(x), length = 40)
    # Bin width times sample size converts a density into a bin count.
    count_scale <- diff(histo$mids[1:2]) * length(x)
    sample_mean <- mean(x)
    sample_sd <- sd(x)
    density_curve <- dnorm(grid_x, mean = sample_mean, sd = sample_sd) * count_scale
    cumulative_curve <- pnorm(grid_x, mean = sample_mean, sd = sample_sd) * count_scale
    lines(grid_x, density_curve, col = "blue", lwd = 2)
    lines(grid_x, cumulative_curve, col = "green", lwd = 2)
  }
}
# ER_Plot3: build (and, at top level, print) a ggplot histogram of the
# duration vector p with a density overlay, a square-root y scale, and the
# x axis clipped at mean + 6 sd. Returns invisible NULL for empty input.
# NOTE(review): the `..count..` / `..density..` spellings are deprecated in
# modern ggplot2 in favour of after_stat() — confirm the installed version.
ER_Plot3 <- function(p,chart_title="no title"){
if (NROW(p) > 0) {
ggplot(NULL,aes(p,y=..count..),cex.main=1) + xlim(0,mean(p)+6*sd(p)) +
geom_histogram(aes(y = ..density..)) +
# aes(fill=..count..)) +
scale_y_sqrt() +
geom_density(aes(y = ..density..),colour="black",linetype=1,size=.2,fill=hcl(100,180,70,.3),adjust=5) +
# stat_density(adjust=mean(p)) +
labs(title= paste("Duration stats Process:",chart_title)) +
labs(x="Seconds")}
}
# ER_Plot4: ggplot histogram + density overlay for a duration vector,
# returned (not printed) so callers can arrange several plots together,
# e.g. with grid.arrange(). Returns NULL for empty input.
#
# p           numeric vector of durations in seconds
# chart_title text appended to the plot title
ER_Plot4 <- function(p, chart_title = "no title") {
  if (NROW(p) > 0) {
    newplot <- ggplot(NULL, aes(p, y = ..count..), cex.main = 1) +
      geom_histogram(aes(y = ..density..)) +
      # aes(fill=..count..)) +
      scale_y_sqrt() +
      geom_density(aes(y = ..density..), colour = "red", linetype = 3,
                   size = .2, fill = "gray98", alpha = .4) +
      # stat_density(adjust=mean(p)) +
      labs(title = paste("Duration stats Process:", chart_title)) +
      labs(x = "Seconds") +
      theme(title = element_text(size = 6, face = "bold")) +
      # Bug fix: this scale was previously a free-standing statement whose
      # value was silently discarded; it is now part of the plot chain.
      scale_fill_gradient("Count", low = "green", high = "red")
    return(newplot)
  }
  return(NULL)
}
## [[1]]
## $breaks
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0 7.5
##
## $counts
## [1] 2520 2380 24152 3231 24262 2118 1413 3476 140 493 70
## [12] 4756 213 142
##
## $density
## [1] 0.072658075 0.068621515 0.696364213 0.093158031 0.699535796
## [6] 0.061067382 0.040740420 0.100222011 0.004036560 0.014214457
## [11] 0.002018280 0.137127699 0.006141337 0.004094225
##
## $mids
## [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25 6.75 7.25
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[2]]
## $breaks
## [1] 0 5000 10000 15000 20000 25000 30000 35000 40000 45000 50000
## [12] 55000 60000
##
## $counts
## [1] 81336 0 0 0 14 0 0 0 0 0 0
## [12] 10
##
## $density
## [1] 1.999410e-04 0.000000e+00 0.000000e+00 0.000000e+00 3.441495e-08
## [6] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [11] 0.000000e+00 2.458210e-08
##
## $mids
## [1] 2500 7500 12500 17500 22500 27500 32500 37500 42500 47500 52500
## [12] 57500
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[3]]
## $breaks
## [1] 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21
##
## $counts
## [1] 18538 310 62 124 62 0 0 0 0 0 0
## [12] 0 0 0 0 62
##
## $density
## [1] 0.967637540 0.016181230 0.003236246 0.006472492 0.003236246
## [6] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [11] 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [16] 0.003236246
##
## $mids
## [1] 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5 13.5 14.5 15.5 16.5 17.5 18.5
## [15] 19.5 20.5
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[4]]
## $breaks
## [1] 0 5000 10000 15000 20000 25000 30000 35000 40000 45000 50000
## [12] 55000 60000
##
## $counts
## [1] 34649 0 0 0 0 0 0 0 0 0 0
## [12] 6
##
## $density
## [1] 1.999654e-04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [6] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [11] 0.000000e+00 3.462704e-08
##
## $mids
## [1] 2500 7500 12500 17500 22500 27500 32500 37500 42500 47500 52500
## [12] 57500
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[5]]
## $breaks
## [1] 0 2000 4000 6000 8000 10000 12000 14000 16000 18000 20000
## [12] 22000 24000 26000
##
## $counts
## [1] 12846 0 0 0 0 0 0 0 0 0 0
## [12] 0 19
##
## $density
## [1] 4.992616e-04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [6] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [11] 0.000000e+00 0.000000e+00 7.384376e-07
##
## $mids
## [1] 1000 3000 5000 7000 9000 11000 13000 15000 17000 19000 21000
## [12] 23000 25000
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[6]]
## $breaks
## [1] 0.00 0.05 0.10 0.15 0.20 0.25 0.30 0.35 0.40 0.45 0.50 0.55 0.60 0.65
## [15] 0.70 0.75 0.80 0.85
##
## $counts
## [1] 420 1332 132 122 60 65 52 26 34 26 17 0 10 5
## [15] 5 0 5
##
## $density
## [1] 3.63479013 11.52747728 1.14236261 1.05581999 0.51925573
## [6] 0.56252704 0.45002164 0.22501082 0.29424492 0.22501082
## [11] 0.14712246 0.00000000 0.08654262 0.04327131 0.04327131
## [16] 0.00000000 0.04327131
##
## $mids
## [1] 0.025 0.075 0.125 0.175 0.225 0.275 0.325 0.375 0.425 0.475 0.525
## [12] 0.575 0.625 0.675 0.725 0.775 0.825
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[7]]
## $breaks
## [1] 0.0 0.2 0.4 0.6 0.8 1.0 1.2 1.4 1.6 1.8 2.0 2.2
##
## $counts
## [1] 792 234 60 12 12 6 0 0 0 0 6
##
## $density
## [1] 3.52941176 1.04278075 0.26737968 0.05347594 0.05347594 0.02673797
## [7] 0.00000000 0.00000000 0.00000000 0.00000000 0.02673797
##
## $mids
## [1] 0.1 0.3 0.5 0.7 0.9 1.1 1.3 1.5 1.7 1.9 2.1
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[8]]
## $breaks
## [1] 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5 6.0 6.5 7.0
##
## $counts
## [1] 9801 2607 143 66 55 33 0 0 0 11 0 0 0 11
##
## $density
## [1] 1.540190147 0.409680207 0.022471910 0.010371651 0.008643042
## [6] 0.005185825 0.000000000 0.000000000 0.000000000 0.001728608
## [11] 0.000000000 0.000000000 0.000000000 0.001728608
##
## $mids
## [1] 0.25 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25 5.75 6.25 6.75
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[9]]
## $breaks
## [1] 52 54 56 58 60 62 64 66 68 70 72 74 76 78 80 82 84 86 88 90 92 94 96
##
## $counts
## [1] 4047 13845 852 852 426 213 0 0 0 0 0
## [12] 213 0 0 426 426 0 0 0 213 0 213
##
## $density
## [1] 0.093137255 0.318627451 0.019607843 0.019607843 0.009803922
## [6] 0.004901961 0.000000000 0.000000000 0.000000000 0.000000000
## [11] 0.000000000 0.004901961 0.000000000 0.000000000 0.009803922
## [16] 0.009803922 0.000000000 0.000000000 0.000000000 0.004901961
## [21] 0.000000000 0.004901961
##
## $mids
## [1] 53 55 57 59 61 63 65 67 69 71 73 75 77 79 81 83 85 87 89 91 93 95
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[10]]
## $breaks
## [1] 0 200 400 600 800 1000 1200 1400
##
## $counts
## [1] 4 0 0 0 0 0 7
##
## $density
## [1] 0.001818182 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## [7] 0.003181818
##
## $mids
## [1] 100 300 500 700 900 1100 1300
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[11]]
## $breaks
## [1] 0 1000 2000 3000 4000 5000 6000 7000 8000 9000 10000
## [12] 11000 12000 13000 14000 15000 16000 17000 18000
##
## $counts
## [1] 31789 0 0 0 0 0 0 0 0 0 0
## [12] 0 0 0 0 0 0 54
##
## $density
## [1] 9.983042e-04 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [6] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [11] 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00
## [16] 0.000000e+00 0.000000e+00 1.695820e-06
##
## $mids
## [1] 500 1500 2500 3500 4500 5500 6500 7500 8500 9500 10500
## [12] 11500 12500 13500 14500 15500 16500 17500
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
##
## [[12]]
## $breaks
## [1] 0.5 1.0 1.5 2.0 2.5 3.0 3.5 4.0 4.5 5.0 5.5
##
## $counts
## [1] 24 96 24 0 0 48 0 480 96 24
##
## $density
## [1] 0.06060606 0.24242424 0.06060606 0.00000000 0.00000000 0.12121212
## [7] 0.00000000 1.21212121 0.24242424 0.06060606
##
## $mids
## [1] 0.75 1.25 1.75 2.25 2.75 3.25 3.75 4.25 4.75 5.25
##
## $xname
## [1] "x"
##
## $equidist
## [1] TRUE
##
## attr(,"class")
## [1] "histogram"
## [[1]]
## NULL
##
## [[2]]
## NULL
##
## [[3]]
## NULL
##
## [[4]]
## NULL
##
## [[5]]
## NULL
##
## [[6]]
## NULL
##
## [[7]]
## NULL
##
## [[8]]
## NULL
##
## [[9]]
## NULL
##
## [[10]]
## NULL
##
## [[11]]
## NULL
##
## [[12]]
## NULL
## [[1]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## [[2]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 24 rows containing non-finite values (stat_bin).
## Warning: Removed 24 rows containing non-finite values (stat_density).
##
## [[3]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 62 rows containing non-finite values (stat_bin).
## Warning: Removed 62 rows containing non-finite values (stat_density).
##
## [[4]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing non-finite values (stat_density).
##
## [[5]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 19 rows containing non-finite values (stat_bin).
## Warning: Removed 19 rows containing non-finite values (stat_density).
## Warning: Removed 1 rows containing missing values (geom_bar).
##
## [[6]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 5 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing non-finite values (stat_density).
##
## [[7]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6 rows containing non-finite values (stat_bin).
## Warning: Removed 6 rows containing non-finite values (stat_density).
##
## [[8]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 55 rows containing non-finite values (stat_bin).
## Warning: Removed 55 rows containing non-finite values (stat_density).
##
## [[9]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## [[10]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## [[11]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 54 rows containing non-finite values (stat_bin).
## Warning: Removed 54 rows containing non-finite values (stat_density).
##
## [[12]]
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# grepl() returns a logical vector (unlike grep(), which returns matching
# indices), so it can be used directly as a subset condition.
# NOTE(review): "*Hello*" is a glob-style pattern, but grepl() expects a
# regex, where plain "Hello" already matches any substring; confirm the
# leading "*" is tolerated by the regex engine rather than relied upon.
QP_Check <- subset.data.frame(QP, grepl("*Hello*", processprocessname))
head(QP_Check)
hist(QP_Check$processduration)
ER_Plot2(QP_Check$processduration, "test")
ER_Plot4(QP_Check$processduration, "test")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
x <- QP_Check$processduration
# Manual histogram with an overlaid curve, clipped at mean + 2 sd.
# NOTE(review): xlab "Miles Per Gallon" looks like a leftover from the
# classic mtcars example — presumably it should read "Seconds".
h <- hist(x, breaks = 10, col = "red", xlab = "Miles Per Gallon", xlim = c(0,
mean(x) + 2 * sd(x)), main = "Histogram with Normal Curve")
xfit <- seq(min(x), max(x), length = 40)
# NOTE(review): dbinom() is evaluated over trial counts 1..40, not over
# xfit, yet the result is plotted against xfit below — verify this overlay
# is intentional and not a leftover from a dnorm() example.
yfit <- dbinom(seq(1, length = 40), 100, 0.1)
yfit <- yfit * length(x)
lines(xfit, yfit, col = "blue", lwd = 2)
# Compare several histogram binwidths for processduration side by side.
# NOTE(review): grid.arrange() comes from the gridExtra package, which is
# never loaded in this file — confirm library(gridExtra) runs elsewhere.
{
hist(subset.data.frame(QP, processduration < 500)$processduration)
density(subset.data.frame(QP, processduration < 500)$processduration)
# Six ggplot histograms of the same column with different break sequences,
# from ggplot's default binning down to 0.1-second bins over [0, 5].
grid.arrange(ggplot(QP, aes(QP$processduration)) + geom_histogram() + labs(title = paste("Default")) +
theme(title = element_text(size = 8, face = "bold")), ggplot(QP, aes(QP$processduration)) +
geom_histogram(breaks = seq(0, 1000, by = 100)) + labs(title = paste("0 to 1000 by 100")) +
theme(title = element_text(size = 8, face = "bold")), ggplot(QP, aes(QP$processduration)) +
geom_histogram(breaks = seq(0, 200, by = 10)) + labs(title = paste("0 to 200 by 10")) +
theme(title = element_text(size = 8, face = "bold")), ggplot(QP, aes(QP$processduration)) +
geom_histogram(breaks = seq(0, 100, by = 10)) + labs(title = paste("0 to 100 by 10")) +
theme(title = element_text(size = 8, face = "bold")), ggplot(QP, aes(QP$processduration)) +
geom_histogram(breaks = seq(0, 5, by = 0.5)) + labs(title = paste("0 to 5 by .5")) +
theme(title = element_text(size = 8, face = "bold")), ggplot(QP, aes(QP$processduration)) +
geom_histogram(breaks = seq(0, 5, by = 0.1)) + labs(title = paste("0 to 5 by .1")) +
theme(title = element_text(size = 8, face = "bold")))
# Kernel density estimate of the full column; its summary is printed below.
density(QP$processduration)
}
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
##
## Call:
## density.default(x = QP$processduration)
##
## Data: QP$processduration (287936 obs.); Bandwidth 'bw' = 0.2035
##
## x y
## Min. : -0.57 Min. :0.000000
## 1st Qu.:14389.33 1st Qu.:0.000000
## Median :28779.24 Median :0.000000
## Mean :28779.24 Mean :0.003803
## 3rd Qu.:43169.15 3rd Qu.:0.000000
## Max. :57559.05 Max. :1.782271
# Pull a second, duration-filtered time window into local memory.
QP2 <- collect(filter(SPDF_QP, "duration is not null and startdatetime between \"2017-08-13 05:00:59.016\" and \"2017-09-01 00:00:00.000\""))
head(QP2)
# x <- subset.data.frame(QP2,processduration < 100 & processduration >0 &
# grepl('*',processprocessname))
# NOTE(review): grepl("*", ...) matches every row, so this subset is just a
# copy of QP2 — confirm whether a real filter pattern was intended.
x <- subset.data.frame(QP2, grepl("*", processprocessname))
# 2-D scatter of duration against the Julian date, semi-transparent points.
plot(x$julian, x$processduration, main = "Scater Plot", col = rgb(0, 100, 0,
50, maxColorValue = 255), pch = 21)
library("plot3D")
# Coerce the columns to plain numeric vectors for the 3-D plots; duration is
# scaled by 100 to make the z axis readable.
xx <- julian.l <- as.numeric(x$julian)
yy <- processduration.l <- as.numeric(x$processduration)
zz <- duration <- as.numeric(x$duration) * 100
scatter2D(xx, yy, colvar = NULL, col = NULL, add = FALSE)
scatter3D(xx, yy, zz, clab = c("julian", "processduration", "stageduration"))
scatter3D(xx, yy, zz)
library(scatterplot3d)
# Alternative 3-D scatter with depth highlighting and vertical drop lines.
scatterplot3d(x$julian, x$processduration, x$duration, pch = 16, highlight.3d = TRUE,
type = "h", main = "3D Scatterplot")
As you use Spark more and more, copying a SparkDataFrame into an R dataframe will not necessarily remain a valid option: the data may not fit into local memory, and the multiple copies are redundant and heavy to process in terms of I/O.
# sparkR.session.stop()